This analysis evaluates chromosomal instability by using breakpoint SV and CNV data that was formatted by 00-setup-breakpoint_data.R and mapping chromosomal break density. This notebook returns chromosomal break heatmaps and plots for each tumor type group in the plots directory.
This notebook can be run via the command line from the top directory of the repository as follows:
Rscript -e "rmarkdown::render('analyses/chromosomal-instability/01-plot-chromosomal-instability.Rmd',
clean = TRUE)"
# Set seed so heatmaps turn out the same
set.seed(2020)
# Magrittr pipe
`%>%` <- dplyr::`%>%`
Read in the custom functions needed for this analysis.
source(file.path("util", "chr-break-calculate.R"))
source(file.path("util", "chr-break-plot.R"))
# Path to data directory
data_dir <- file.path("..", "..", "data")
scratch_dir <- file.path("..", "..", "scratch")
# Path to output directory
plots_dir <- "plots"
# Path to tumor type plots output directory
hist_plots_dir <- file.path(plots_dir, "tumor-type")
# Create the hist_plots_dir if it does not exist
if (!dir.exists(hist_plots_dir)) {
dir.create(hist_plots_dir, recursive = TRUE)
}
Set up metadata
# Read in the metadata
metadata <- readr::read_tsv(file.path(data_dir, "pbta-histologies.tsv"))
Parsed with column specification:
cols(
.default = col_character(),
OS_days = [32mcol_double()[39m,
age_last_update_days = [32mcol_double()[39m,
normal_fraction = [32mcol_double()[39m,
tumor_fraction = [32mcol_double()[39m,
tumor_ploidy = [32mcol_double()[39m,
molecular_subtype = [33mcol_logical()[39m
)
See spec(...) for full column specifications.
221 parsing failures.
row col expected actual file
2606 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../data/pbta-histologies.tsv'
2607 molecular_subtype 1/0/T/F/TRUE/FALSE Group4 '../../data/pbta-histologies.tsv'
2608 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../data/pbta-histologies.tsv'
2609 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../data/pbta-histologies.tsv'
2610 molecular_subtype 1/0/T/F/TRUE/FALSE Group3 '../../data/pbta-histologies.tsv'
.... ................. .................. ...... .................................
See problems(...) for more details.
Read in the union breaks densities TSV that was made by the 00-setup-breakpoint-data.R script.
breaks_densities_df <- readr::read_tsv(
file.path("breakpoint-data", "union_of_breaks_densities.tsv")) %>%
# Add on the short histology column from the metadata for plotting purposes
dplyr::inner_join(metadata %>% dplyr::select(Kids_First_Biospecimen_ID, short_histology),
by = c("samples" = "Kids_First_Biospecimen_ID"))
Parsed with column specification:
cols(
samples = [31mcol_character()[39m,
experimental_strategy = [31mcol_character()[39m,
genome_size = [32mcol_double()[39m,
breaks_count = [32mcol_double()[39m,
breaks_density = [32mcol_double()[39m
)
Load in the previously formatted breakpoint data.
breaks_list <- readr::read_rds(file.path("breakpoint-data", "breaks_lists.RDS"))
Get a vector of the biospecimen IDs.
common_samples <- unique(breaks_list$union_of_breaks$samples)
Set up chromosome size data. It just so happens that this BED file: WGS.hg38.strelka2.unpadded.bed is actually just a list of the chromosome sizes so we are using that for now.
# Set up Chr sizes
chr_sizes <- readr::read_tsv(file.path(data_dir, "WGS.hg38.strelka2.unpadded.bed"),
col_names = FALSE
) %>%
# Reformat the chromosome variable to drop the "chr"
dplyr::mutate(X1 = factor(gsub("chr", "", X1),
levels = c(1:22, "X", "Y", "M")
)) %>%
# Remove sex chromosomes
dplyr::filter(!(X1 %in% c("X", "Y", "M")))
Parsed with column specification:
cols(
X1 = [31mcol_character()[39m,
X2 = [32mcol_double()[39m,
X3 = [32mcol_double()[39m
)
# Make chromosome size named vector
chr_sizes_vector <- chr_sizes$X3
names(chr_sizes_vector) <- chr_sizes$X1
breaks_cdf <- breaks_densities_df %>%
# Only plot histologies groups with more than 5 samples
dplyr::group_by(short_histology, add = TRUE) %>%
dplyr::filter(dplyr::n() > 5) %>%
# Calculate histology group mean
dplyr::mutate(hist_mean = mean(breaks_density)) %>%
dplyr::ungroup() %>%
# Now we will plot these as cummulative distribution plots
ggplot2::ggplot(ggplot2::aes(color = short_histology, x = breaks_density)) +
ggplot2::geom_point(stat = "ecdf") +
# Add summary line for mean
ggplot2::geom_vline(ggplot2::aes(xintercept = hist_mean)) +
# Separate by histology
ggplot2::facet_wrap(~short_histology, nrow = 2) +
ggplot2::theme_classic() +
ggplot2::xlab("Chromosomal Breaks per Mb") +
ggplot2::ylab("Rank") +
# Transform to log10 make non-log y-axis labels
ggplot2::scale_x_continuous(trans = "log10", breaks = c(0, 1, 10, 30)) +
ggplot2::theme(legend.position = "none") +
ggplot2::coord_flip() +
# Get rid of x-axis ticks
ggplot2::theme(
axis.text.x = ggplot2::element_blank(),
axis.ticks.x = ggplot2::element_blank(),
strip.text = ggplot2::element_text(size = 4))
# Save the plot to a png
ggplot2::ggsave(file.path(plots_dir, "breaks_cdf_plot.png"),
plot = breaks_cdf, width = 17, height = 20, unit = "cm")
Print from png so rendering is smoother.
For each sample, we will bin the genome and count how many chromosome breaks occur for each bin, given the given bin_size.
We will run this for each sample and return a list of three GenomicRanges objects:
1) union_of_breaks contains the combined break counts for both SV and CNV break data.
2) cnv_breaks contains the number of break counts for CNV.
3) sv_breaks contains the number of break counts for SV.
# Change here and it will change the rest
bin_size <- 1e6
# Get a big list of break densities for each sample.
sample_densities <- lapply(common_samples, function(sample_id) {
lapply(breaks_list, function(breaks_df) {
break_density(breaks_df,
sample_id = sample_id,
start_col = "coord",
end_col = "coord",
window_size = bin_size,
chr_sizes_vector = chr_sizes_vector
)
})
})
Save to an RDS file
# Save to an RDS file
readr::write_rds(
sample_densities,
file.path(scratch_dir, "sample_breakpoint_densities.RDS")
)
Given the GenomicRanges objects for each sample, create a combined plot for each.
Make chromosome labeling HeatmapAnnotation object.
# Extract chromosome names from a GenomicRanges object
chr_labels <- sample_densities[[1]]$union_of_breaks@seqnames
chr_labels <- paste0("chr", as.factor(chr_labels))
# Make a key for assigning alternating colors to the chromosomes
chr_colors <- rep(c("grey", "lightblue"), length.out = length(unique(chr_labels)))
names(chr_colors) <- unique(chr_labels)
# Create the Heatmap annotation object
chr_annot <- ComplexHeatmap::HeatmapAnnotation(
df = data.frame(chr_labels), col = list(chr_labels = c(chr_colors)),
name = "",
show_legend = FALSE,
show_annotation_name = FALSE
)
Make histology labeling HeatmapAnnotation object.
# Get the histologies for the samples in this set and order them by histology
histologies <-
data.frame(Kids_First_Biospecimen_ID = common_samples) %>%
dplyr::inner_join(dplyr::select(metadata, Kids_First_Biospecimen_ID, short_histology)) %>%
dplyr::mutate(short_histology = as.factor(short_histology)) %>%
dplyr::arrange(short_histology) %>%
tibble::column_to_rownames("Kids_First_Biospecimen_ID")
Joining, by = "Kids_First_Biospecimen_ID"
Column `Kids_First_Biospecimen_ID` joining factor and character vector, coercing into character vector
# Create the Heatmap annotation object
hist_annot <- ComplexHeatmap::rowAnnotation(
df = histologies,
show_annotation_name = FALSE
)
Make a color function.
col_fun <- circlize::colorRamp2(
c(0, 2, 5, 10, 20),
c("#edf8fb", "#b2e2e2", "#66c2a4", "#2ca25f", "#006d2c")
)
Make a function for making the heatmaps.
breaks_heatmap <- function(data_name) {
# A wrapper function for making a heatmap from the samples GenomicRanges list.
#
# Args:
# data_name: a character string that matches a name in the list.
# Returns:
# A heatmap of the chromosomal breaks
# Pull out the total_counts
total_counts <- lapply(sample_densities, function(granges_list) {
granges_list[[data_name]]@elementMetadata@listData$total_counts
})
# Make into a data.frame
total_counts <- dplyr::bind_rows(total_counts) %>%
dplyr::select(rownames(histologies)) %>%
t()
# Add chromosome labels
colnames(total_counts) <- chr_labels
# Plot on a heatmap
heatmap <- ComplexHeatmap::Heatmap(total_counts,
col = col_fun,
heatmap_legend_param = list(title = "Count of chr breaks"),
cluster_columns = FALSE,
cluster_rows = FALSE,
show_column_names = FALSE,
show_row_names = FALSE,
bottom_annotation = chr_annot,
left_annotation = hist_annot
)
# Return plot
return(heatmap)
}
union_of_heatmap <- breaks_heatmap(data_name = "union_of_breaks")
# Save plot as PNG
png(file.path(plots_dir, paste0("union_of_breaks_heatmap.png")),
width = 1200, height = 900, units = "px"
)
union_of_heatmap
dev.off()
null device
1
# Print out here
union_of_heatmap
cnv_heatmap <- breaks_heatmap(data_name = "cnv_breaks")
# Save plot as PNG
png(file.path(plots_dir, paste0("cnv_breaks_heatmap.png")),
width = 1200, height = 900, units = "px"
)
cnv_heatmap
dev.off()
null device
1
# Print out here
cnv_heatmap
sv_heatmap <- breaks_heatmap(data_name = "sv_breaks")
# Save plot as PNG
png(file.path(plots_dir, paste0("sv_breaks_heatmap.png")),
width = 1200, height = 900, units = "px"
)
sv_heatmap
dev.off()
null device
1
# Print out here
sv_heatmap
Same as was done for each sample, now we will calculate densities for each tumor-type group.
# Change bin_size here and it will change the rest
bin_size <- 1e6
# Get a list of the tumor_types for which we have DNA-seq data
tumor_types <- metadata %>%
dplyr::filter(!is.na(short_histology), experimental_strategy != "RNA-Seq") %>%
dplyr::distinct(short_histology) %>%
dplyr::pull(short_histology)
Run the density calculations for the groups.
# Get a big list of break densities for each sample.
group_densities <- lapply(tumor_types, function(tumor_type) {
# Obtain a list of sample_ids to subset by
sample_ids <- metadata %>%
dplyr::filter(metadata$short_histology == tumor_type) %>%
dplyr::pull(Kids_First_Biospecimen_ID)
# Double check these samples are in the list
check_samples <- (sample_ids %in% common_samples)
# If no samples, go to next
if (sum(check_samples) > 1) {
message(paste("Calculating breakpoint density for", tumor_type, "samples"))
lapply(breaks_list, function(breaks_df) {
break_density(breaks_df,
sample_id = sample_ids,
start_col = "coord",
end_col = "coord",
window_size = bin_size,
chr_sizes_vector = chr_sizes_vector
)
})
}
})
Calculating breakpoint density for Ependymoma samples
Calculating breakpoint density for HGAT samples
Calculating breakpoint density for LGAT samples
Calculating breakpoint density for Other samples
Calculating breakpoint density for CNS sarcoma samples
Calculating breakpoint density for Schwannoma samples
Calculating breakpoint density for ATRT samples
Calculating breakpoint density for Choroid plexus tumor samples
Calculating breakpoint density for Craniopharyngioma samples
Calculating breakpoint density for DNET samples
Calculating breakpoint density for Teratoma samples
Calculating breakpoint density for Hemangioblastoma samples
Calculating breakpoint density for Meningioma samples
Calculating breakpoint density for Adenoma samples
Calculating breakpoint density for Neurofibroma samples
Calculating breakpoint density for Ganglioglioma samples
Calculating breakpoint density for Langerhans Cell histiocytosis samples
Calculating breakpoint density for CNS Embryonal tumor samples
Calculating breakpoint density for CNS neuroblastoma samples
Calculating breakpoint density for Chordoma samples
Calculating breakpoint density for MPNST samples
Calculating breakpoint density for Glial-neuronal tumor NOS samples
Calculating breakpoint density for Pineoblastoma samples
Calculating breakpoint density for CNS EFT-CIC samples
Calculating breakpoint density for Central neurocytoma samples
Calculating breakpoint density for Germinoma samples
Calculating breakpoint density for CNS Rhabdomyosarcoma samples
Calculating breakpoint density for Dysplasia samples
Calculating breakpoint density for Oligodendroglioma samples
Calculating breakpoint density for Hemangioma samples
Calculating breakpoint density for LGMT samples
Calculating breakpoint density for CNS ganglioneuroblastoma samples
Calculating breakpoint density for Medulloblastoma samples
# Bring along the tumor-type labels
names(group_densities) <- tumor_types
# Remove tumor_types data that didn't have at least two samples
group_densities <- group_densities[!sapply(group_densities, is.null)]
Save to an RDS file
# Save to an RDS file
readr::write_rds(
group_densities,
file.path(scratch_dir, "histology_breakpoint_densities.RDS")
)
Here we will plot median number of break points for the tumor-type group per each bin.
purrr::imap(group_densities, function(.x, name = .y) {
# Make the combo plot
multipanel_break_plot(
granges_list = .x,
plot_name = name,
y_val = "median_counts",
y_lab = "Median Breaks per Mb",
plot_dir = hist_plots_dir
)
})
$Ependymoma
$HGAT
$LGAT
$Other
$`CNS sarcoma`
$Schwannoma
$ATRT
$`Choroid plexus tumor`
$Craniopharyngioma
$DNET
$Teratoma
$Hemangioblastoma
$Meningioma
$Adenoma
$Neurofibroma
$Ganglioglioma
$`Langerhans Cell histiocytosis`
$`CNS Embryonal tumor`
$`CNS neuroblastoma`
$Chordoma
$MPNST
$`Glial-neuronal tumor NOS`
$Pineoblastoma
$`CNS EFT-CIC`
$`Central neurocytoma`
$Germinoma
$`CNS Rhabdomyosarcoma`
$Dysplasia
$Oligodendroglioma
$Hemangioma
$LGMT
$`CNS ganglioneuroblastoma`
$Medulloblastoma
Zip up the PNG files into one file.
# Declare name of zip file
zip_file <- paste0(hist_plots_dir, ".zip")
# Remove any current zip_file of this name so we can overwrite it
if (file.exists(zip_file)) {
file.remove(zip_file)
}
[1] TRUE
# Zip up the plots
zip(zip_file, hist_plots_dir)
adding: plots/tumor-type/ (stored 0%)
adding: plots/tumor-type/Ependymoma_breaks.png (deflated 13%)
adding: plots/tumor-type/CNS Embryonal tumor_breaks.png (deflated 12%)
adding: plots/tumor-type/Glial-neuronal tumor NOS_breaks.png (deflated 12%)
adding: plots/tumor-type/ATRT_breaks.png (deflated 13%)
adding: plots/tumor-type/Medulloblastoma_breaks.png (deflated 13%)
adding: plots/tumor-type/Oligodendroglioma_breaks.png (deflated 11%)
adding: plots/tumor-type/LGMT_breaks.png (deflated 10%)
adding: plots/tumor-type/.DS_Store (deflated 97%)
adding: plots/tumor-type/HGAT_breaks.png (deflated 13%)
adding: plots/tumor-type/Hemangioma_breaks.png (deflated 11%)
adding: plots/tumor-type/LGAT_breaks.png (deflated 14%)
adding: plots/tumor-type/Pineoblastoma_breaks.png (deflated 11%)
adding: plots/tumor-type/Dysplasia_breaks.png (deflated 13%)
adding: plots/tumor-type/MPNST_breaks.png (deflated 11%)
adding: plots/tumor-type/Central neurocytoma_breaks.png (deflated 10%)
adding: plots/tumor-type/CNS neuroblastoma_breaks.png (deflated 11%)
adding: plots/tumor-type/Neurofibroma_breaks.png (deflated 13%)
adding: plots/tumor-type/Chordoma_breaks.png (deflated 14%)
adding: plots/tumor-type/Schwannoma_breaks.png (deflated 13%)
adding: plots/tumor-type/Hemangioblastoma_breaks.png (deflated 12%)
adding: plots/tumor-type/DNET_breaks.png (deflated 14%)
adding: plots/tumor-type/Langerhans Cell histiocytosis_breaks.png (deflated 12%)
adding: plots/tumor-type/CNS Rhabdomyosarcoma_breaks.png (deflated 10%)
adding: plots/tumor-type/Meningioma_breaks.png (deflated 13%)
adding: plots/tumor-type/Ganglioglioma_breaks.png (deflated 14%)
adding: plots/tumor-type/CNS sarcoma_breaks.png (deflated 9%)
adding: plots/tumor-type/Germinoma_breaks.png (deflated 10%)
adding: plots/tumor-type/Choroid plexus tumor_breaks.png (deflated 13%)
adding: plots/tumor-type/Craniopharyngioma_breaks.png (deflated 13%)
adding: plots/tumor-type/Teratoma_breaks.png (deflated 13%)
adding: plots/tumor-type/CNS EFT-CIC_breaks.png (deflated 14%)
adding: plots/tumor-type/Other_breaks.png (deflated 13%)
adding: plots/tumor-type/Adenoma_breaks.png (deflated 11%)
adding: plots/tumor-type/CNS ganglioneuroblastoma_breaks.png (deflated 11%)
sessionInfo()
R version 3.6.0 (2019-04-26)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 9 (stretch)
Matrix products: default
BLAS/LAPACK: /usr/lib/libopenblasp-r0.2.19.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8
[4] LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] optparse_1.6.2 GenomicRanges_1.38.0 GenomeInfoDb_1.22.0 IRanges_2.20.1
[5] S4Vectors_0.24.1 BiocGenerics_0.32.0
loaded via a namespace (and not attached):
[1] utf8_1.1.4 reticulate_1.12 R.utils_2.9.0
[4] tidyselect_0.2.5 RSQLite_2.1.1 AnnotationDbi_1.48.0
[7] htmlwidgets_1.3 grid_3.6.0 BiocParallel_1.20.0
[10] DESeq_1.38.0 munsell_0.5.0 codetools_0.2-16
[13] withr_2.1.2 colorspace_1.4-1 Biobase_2.46.0
[16] OrganismDbi_1.28.0 knitr_1.23 rstudioapi_0.10
[19] ggsignif_0.6.0 labeling_0.3 GenomeInfoDbData_1.2.2
[22] KMsurv_0.1-5 hwriter_1.3.2 polyclip_1.10-0
[25] bit64_0.9-7 farver_1.1.0 rprojroot_1.3-2
[28] downloader_0.4 generics_0.0.2 vctrs_0.1.0
[31] xfun_0.8 biovizBase_1.34.1 ggthemes_4.2.0
[34] BiocFileCache_1.10.2 EDASeq_2.20.0 R6_2.4.0
[37] doParallel_1.0.14 clue_0.3-57 locfit_1.5-9.1
[40] AnnotationFilter_1.10.0 bitops_1.0-6 reshape_0.8.8
[43] DelayedArray_0.12.0 assertthat_0.2.1 scales_1.0.0
[46] nnet_7.3-12 gtable_0.3.0 sva_3.34.0
[49] ggbio_1.34.0 ensembldb_2.10.2 rlang_0.4.0
[52] zeallot_0.1.0 genefilter_1.68.0 cmprsk_2.2-8
[55] GlobalOptions_0.1.1 splines_3.6.0 rtracklayer_1.46.0
[58] lazyeval_0.2.2 acepack_1.4.1 dichromat_2.0-0
[61] selectr_0.4-1 broom_0.5.2 checkmate_1.9.4
[64] BiocManager_1.30.4 yaml_2.2.0 reshape2_1.4.3
[67] GenomicFeatures_1.38.0 backports_1.1.4 Hmisc_4.2-0
[70] RBGL_1.62.1 tools_3.6.0 ggplot2_3.2.0
[73] RColorBrewer_1.1-2 Rcpp_1.0.1 plyr_1.8.4
[76] base64enc_0.1-3 progress_1.2.2 zlibbioc_1.32.0
[79] purrr_0.3.2 RCurl_1.95-4.12 prettyunits_1.0.2
[82] ggpubr_0.2.4 rpart_4.1-15 openssl_1.4
[85] GetoptLong_0.1.7 cowplot_0.9.4 zoo_1.8-6
[88] SummarizedExperiment_1.16.0 ggrepel_0.8.1 cluster_2.1.0
[91] magrittr_1.5 data.table_1.12.2 circlize_0.4.8
[94] survminer_0.4.4 ProtGenerics_1.18.0 matrixStats_0.55.0
[97] aroma.light_3.16.0 hms_0.4.2 evaluate_0.14
[100] xtable_1.8-4 XML_3.98-1.20 gridExtra_2.3
[103] shape_1.4.4 compiler_3.6.0 biomaRt_2.42.0
[106] tibble_2.1.3 crayon_1.3.4 R.oo_1.22.0
[109] htmltools_0.3.6 mgcv_1.8-28 Formula_1.2-3
[112] tidyr_0.8.3 geneplotter_1.64.0 DBI_1.0.0
[115] tweenr_1.0.1 dbplyr_1.4.2 matlab_1.0.2
[118] ComplexHeatmap_2.2.0 MASS_7.3-51.4 rappdirs_0.3.1
[121] ShortRead_1.44.1 Matrix_1.2-17 getopt_1.20.3
[124] readr_1.3.1 cli_1.1.0 R.methodsS3_1.7.1
[127] km.ci_0.5-2 pkgconfig_2.0.2 GenomicAlignments_1.22.1
[130] foreign_0.8-71 xml2_1.2.0 foreach_1.4.4
[133] annotate_1.64.0 XVector_0.26.0 rvest_0.3.4
[136] stringr_1.4.0 VariantAnnotation_1.32.0 digest_0.6.20
[139] ConsensusClusterPlus_1.50.0 graph_1.62.0 Biostrings_2.54.0
[142] rmarkdown_1.13 survMisc_0.5.5 TCGAbiolinks_2.14.0
[145] htmlTable_1.13.1 edgeR_3.28.0 curl_3.3
[148] Rsamtools_2.2.1 rjson_0.2.20 nlme_3.1-140
[151] jsonlite_1.6 askpass_1.1 limma_3.42.0
[154] BSgenome_1.54.0 fansi_0.4.0 pillar_1.4.2
[157] lattice_0.20-38 GGally_1.4.0 httr_1.4.0
[160] survival_2.44-1.1 glue_1.3.1 png_0.1-7
[163] iterators_1.0.10 bit_1.1-14 ggforce_0.2.2
[166] stringi_1.4.3 rematch2_2.0.1 blob_1.1.1
[169] latticeExtra_0.6-28 memoise_1.1.0 styler_1.1.1
[172] dplyr_0.8.3